# Scrape the first 20 pages of "python爬虫" (Python crawler) job listings for
# Beijing from Boss直聘 (zhipin.com) and save the results to a CSV file.
import csv
import re
import time

import requests
from bs4 import BeautifulSoup


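# Browser-like request headers (mimicking desktop Chrome); zhipin.com tends to
# reject requests that do not look like they come from a real browser.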
headers = {
    'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/webp,image/apng,*/*;q=0.8'),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'www.zhipin.com',
    'Referer': ('http://www.zhipin.com/job_detail/?query='
                'python%E7%88%AC%E8%99%AB&scity=101010100&source=2'),
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.3; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/59.0.3071.115 Safari/537.36'),
}
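# Reuse a single Session so any cookies set by the site persist across requests.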
s = requests.Session()
for i in range(1, 21):
    time.sleep(3)  # throttle between listing pages so the crawler is less likely to be banned
    url = ('http://www.zhipin.com/c101010100/h_101010100/'
           '?query=python%E7%88%AC%E8%99%AB&page={0}&ka=page-{0}'.format(i))
    html = s.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'html.parser')
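    # Every listing page links to individual postings via hrefs of the
    # form /job_detail/<id>.html; collect those anchors here.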
    links = soup.find_all('a', {'href': re.compile(r'/job_detail/\d*\.html')})
    for a in links:
        link = 'http://www.zhipin.com' + a['href']
        time.sleep(1)  # assumption: a shorter pause between detail pages is enough
        html = s.get(link, headers=headers).text
        soup = BeautifulSoup(html, 'html.parser')
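        # On a detail page, info-primary holds the title/salary block,
        # info-company the company blurb, and job-sec the job description.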
        infos = soup.find_all('div', {'class': 'info-primary'})
        companies = soup.find_all('div', {'class': 'info-company'})
        secs = soup.find_all('div', {'class': 'job-sec'})
        for info, company, sec in zip(infos, companies, secs):
            # One row per posting: position info, company info, description.
            with open('boss.csv', 'at', encoding='utf-8', newline='') as f:
                csv.writer(f).writerow([info.get_text(strip=True),
                                        company.get_text(strip=True),
                                        sec.get_text(strip=True)])

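# A minimal sanity check (a sketch; assumes boss.csv was produced by the loop
# above): read the file back and print the first few rows.
# with open('boss.csv', encoding='utf-8', newline='') as f:
#     for row in list(csv.reader(f))[:5]:
#         print(row)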